register '$PIGGYBANK_PATH';

DEFINE DATE_EXTRACT_MM 
org.apache.pig.piggybank.evaluation.util.apachelogparser.DateExtractor('yyyy-MM');

DEFINE DATE_EXTRACT_DD 
org.apache.pig.piggybank.evaluation.util.apachelogparser.DateExtractor('yyyy-MM-dd');

-- pig/input/test-pig-full.txt
raw_logs = load '$INPUT' USING org.apache.pig.piggybank.storage.MyRegExLoader('^(\\S+) (\\S+) (\\S+) \\[([\\w:/]+\\s[+\\-]\\d{4})\\] "(\\S+) (\\S+) (HTTP[^"]+)" (\\S+) (\\S+) "([^"]*)" "([^"]*)" "(\\S+)" "(\\S+)" (\\S+) "(.*)" (\\S+) (\\S+)')
as (remoteAddr: chararray, 
n2: chararray, 
n3: chararray, 
time: chararray, 
method: chararray,
path:chararray,
protocol:chararray,
status: int, 
bytes_string: chararray, 
referrer: chararray, 
browser: chararray, 
n10:chararray,
remoteLogname: chararray, 
remoteAddr12: chararray, 
path2: chararray, 
sessionid: chararray, 
n15: chararray
);

filter_logs = FILTER raw_logs BY not (browser matches '.*pingdom.*');
--item_logs = FOREACH raw_logs GENERATE browser;


all_status_logs = FOREACH filter_logs  GENERATE status;
distinct_status_logs = DISTINCT all_status_logs;
STORE distinct_status_logs into '$OUTPUT' using PigStorage(',');






